The re module provides regular expression matching operations similar to those found in Perl.


regex

expression description
^ Matches the beginning of a line
$ Matches the end of the line
. Matches any character except a newline (any character at all when the re.DOTALL flag is set)
\s Matches any whitespace character
\S Matches any non-whitespace character
* Repeats the preceding character or group zero or more times
*? Repeats the preceding character or group zero or more times (non-greedy)
+ Repeats the preceding character or group one or more times
+? Repeats the preceding character or group one or more times (non-greedy)
[aeiou] Matches a single character in the listed set
[^XYZ] Matches a single character not in the listed set
[a-z0-9] The set of characters can include a range
( Indicates where string extraction is to start
) Indicates where string extraction is to end

methods

method description
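re.findall(pattern, string) returns a list of all non-overlapping matches as strings (if the pattern contains a group, the list holds the group's text instead of the whole match)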
re.search(pattern, string) returns a Match object (which is truthy) for the first match, or None if nothing matches
re.split(pattern, string, maxsplit=0) returns a list of strings
re.sub(pattern, substitute, string) returns a string

findall

import re

alphabet = 'abcdefghijklmnopqrstuvwxyz'

# The three patterns demonstrated below, named for what they match.
any_vowel = '[aeiou]'
vowel_to_last_vowel = '[aeiou].+[aeiou]'
vowel_to_next_vowel = '[aeiou].+?[aeiou]'

# A character class consumes exactly one character, so each vowel is
# reported as its own one-character match.
re.findall(any_vowel, alphabet)
#  ['a', 'e', 'i', 'o', 'u']

# Greedy: `.+` takes as many characters as it can, so the single match
# runs from the first vowel all the way to the last one.
re.findall(vowel_to_last_vowel, alphabet)
#  ['abcdefghijklmnopqrstu']

# Non-greedy: `.+?` takes as few characters as it can, so each match ends
# at the nearest following vowel and the scan resumes after it.
re.findall(vowel_to_next_vowel, alphabet)
#  ['abcde', 'ijklmno']
import re

# Subject strings for the examples below.
short_word = 'abcde'
colon_separated = 'a:b:c:d:e'

# findall collects every substring that matches the regular expression
# and hands them back as a list.
re.findall('[aeiou]', short_word)
#  ['a', 'e']

# Greedy quantifier: `.+` prefers the longest match, so a single result
# stretches up to the last colon.
re.findall('.+:', colon_separated)
#  ['a:b:c:d:']

# Non-greedy quantifier: `.+?` prefers the shortest match, so every
# result ends at the next colon.
re.findall('.+?:', colon_separated)
#  ['a:', 'b:', 'c:', 'd:']
import re

# The patterns are written as raw strings (r'...') so the backslash in \S
# reaches the regex engine untouched. In a plain string literal '\S' is an
# invalid Python escape sequence and triggers a warning at compile time.

# No parentheses: the whole match is returned, including the "From " prefix.
re.findall(r'From \S+@\S+', 'From name@domain blah blah')
#  ['From name@domain']

# Parentheses mark the part to extract: only the address is returned.
re.findall(r'From (\S+@\S+)', 'From name@domain blah blah')
#  ['name@domain']

# Non-greedy \S+? in front of the @ ...
re.findall(r'\S+?@\S+', 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008')
#  ['stephen.marquard@uct.ac.za']

# ... and greedy \S+ give the same result here: \S cannot cross a space, so
# either way the match has to start at "stephen" and run up to the @.
re.findall(r'\S+@\S+', 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008')
#  ['stephen.marquard@uct.ac.za']

split

import re

alphabet = 'abcdefghijklmnopqrstuvwxyz'

# Separator patterns used below.
single_vowel = '[aeiou]'
non_vowel_run = '[^aeiou]+'

# Every vowel acts as a separator. The string starts with 'a', so the
# first piece is the empty string.
re.split(single_vowel, alphabet)
#  ['', 'bcd', 'fgh', 'jklmn', 'pqrst', 'vwxyz']

# Every run of non-vowels acts as a separator, leaving only the vowels.
# The string ends with such a run, so the last piece is empty.
re.split(non_vowel_run, alphabet)
#  ['a', 'e', 'i', 'o', 'u', '']

# maxsplit=1 performs only the first split; the rest of the string is
# returned unsplit as the final element.
re.split(non_vowel_run, alphabet, maxsplit=1)
#  ['a', 'efghijklmnopqrstuvwxyz']

sub

import re

alphabet = 'abcdefghijklmnopqrstuvwxyz'

# Replacement text inserted wherever a pattern matches.
placeholder = "_"

# Each vowel is replaced individually.
re.sub('[aeiou]', placeholder, alphabet)
#  '_bcd_fgh_jklmn_pqrst_vwxyz'

# Each non-vowel is replaced individually: one underscore per letter.
re.sub('[^aeiou]', placeholder, alphabet)
#  'a___e___i_____o_____u_____'

# With `+`, a whole run of non-vowels is one match and therefore collapses
# into a single underscore.
re.sub('[^aeiou]+', placeholder, alphabet)
#  'a_e_i_o_u_'

appendix: dir

# Appendix: list every name defined in the re module's namespace.
# pprint is used instead of print so the long list is shown one name per line.
from pprint import pprint
import re
# dir(re) returns the attribute names of the module as a list of strings.
# NOTE(review): the listing below is a captured transcript from one
# interpreter; the exact names (especially the underscore-prefixed internals
# and helper modules) presumably differ between Python versions -- re-run to
# refresh.
pprint(dir(re))
#  ['A',
#   'ASCII',
#   'DEBUG',
#   'DOTALL',
#   'I',
#   'IGNORECASE',
#   'L',
#   'LOCALE',
#   'M',
#   'MULTILINE',
#   'Match',
#   'Pattern',
#   'RegexFlag',
#   'S',
#   'Scanner',
#   'T',
#   'TEMPLATE',
#   'U',
#   'UNICODE',
#   'VERBOSE',
#   'X',
#   '_MAXCACHE',
#   '__all__',
#   '__builtins__',
#   '__cached__',
#   '__doc__',
#   '__file__',
#   '__loader__',
#   '__name__',
#   '__package__',
#   '__spec__',
#   '__version__',
#   '_cache',
#   '_compile',
#   '_compile_repl',
#   '_expand',
#   '_locale',
#   '_pickle',
#   '_special_chars_map',
#   '_subx',
#   'compile',
#   'copyreg',
#   'enum',
#   'error',
#   'escape',
#   'findall',
#   'finditer',
#   'fullmatch',
#   'functools',
#   'match',
#   'purge',
#   'search',
#   'split',
#   'sre_compile',
#   'sre_parse',
#   'sub',
#   'subn',
#   'template']